import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
print(__version__) # requires version >= 1.9.0
import cufflinks as cf
# For Notebooks: initialise plotly's offline notebook mode before any figure is drawn.
init_notebook_mode(connected=True)
# For offline use: put cufflinks in offline mode; the DataFrame.iplot() call further
# down in this notebook goes through cufflinks.
cf.go_offline()
# Load the three CSV tables used below:
#   df  - tournament-level table, df1 - match-level table, df2 - player-level table
df, df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('WorldCups.csv', 'WorldCupMatches.csv', 'WorldCupPlayers.csv')
)
# OVERALL WORLD CUP DATAFRAME
# Summary of the tournament-level table
df.info()
# Exploring data
df.head(20)
# Mean of the first three attendance values only: just rows 0 to 2 can be converted
# to floats, because they contain a single period and therefore look like decimals.
df['Attendance'].iloc[0:3].astype(float).mean()
# Problem: the Attendance values contain periods as separators, which prevents doing
# anything numeric with them. The fix is to go through the column value by value and
# strip out every period so that the remaining digits can be read as a whole number.
# Before writing the clean-up, try graphing the column to get better insight:
# sns.distplot(df['Attendance'])
# This verified there is a problem with how the original data is formatted.
# The column needs to be cleaned.
def delete_periods(string):
    """Return the integer obtained by removing every period from *string*.

    The Attendance column stores numbers with periods as separators
    (e.g. "1.045.246"), so the periods are dropped before the digits are
    parsed.

    Args:
        string: The raw cell value. Normally a str; values that are already
            whole numbers are accepted too, so re-running the clean-up on an
            already converted column does not fail.

    Returns:
        The value as an int.

    Raises:
        ValueError: If the text left after removing periods is not a valid
            integer, or if a non-string value has a fractional part (it
            cannot be told apart from a mis-parsed separator).
    """
    if isinstance(string, str):
        return int(string.replace(".", ""))
    # Already numeric: only accept values that are exactly whole numbers, so a
    # float such as 590.549 is rejected instead of being silently truncated.
    whole = int(string)
    if whole != string:
        raise ValueError(
            "cannot remove periods from non-integer value {!r}".format(string)
        )
    return whole
# Run every Attendance entry through delete_periods and overwrite the column
df['Attendance'] = df['Attendance'].apply(delete_periods)
# Check that all numbers are now valid: if the conversion worked and the average
# looks sensible, the column is ready for analysis
df['Attendance'].mean()
#Graphing World Cup Attendance
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
plt.style.use('seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in plt.style.available else 'seaborn-darkgrid')
# Line plot of Attendance against Year
graph = df.plot.line(x='Year',y='Attendance',figsize=(16,8),lw=3)
graph.set_title("Number of World Cup Attendees");
graph.set_xlabel("Year")
#Insight: Attendance has increased over the years, with the most attended World Cup being in 1994; no other has been
#as well attended since then
# Next question: which country has won the most?
plt.style.use('fivethirtyeight')
# Histogram over the values of the 'Winner' column
graph = df['Winner'].hist(figsize=(16, 6), bins=20)
graph.set(title="Number of World Cup Wins", ylabel="Games Won");
graph
# Same view for the 'Runners-Up' column
plt.style.use('fivethirtyeight')
graph = df['Runners-Up'].hist(figsize=(16, 6), bins=30)
graph.set(title="Number of times World Cup Runners-Up", ylabel="Number of times");
graph
# Histogram over the values of the 'Third' column.
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
plt.style.use('seaborn-v0_8-muted' if 'seaborn-v0_8-muted' in plt.style.available else 'seaborn-muted')
graph = df['Third'].hist(bins=35,figsize=(20,6))
graph.set_title("Number of times World Cup Third Place");
graph.set_ylabel("Number of times")
graph
# Histogram over the values of the 'Fourth' column.
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
plt.style.use('seaborn-v0_8-pastel' if 'seaborn-v0_8-pastel' in plt.style.available else 'seaborn-pastel')
graph = df['Fourth'].hist(bins=35,figsize=(25,6))
graph.set_title("Number of times World Cup Fourth Place");
graph.set_ylabel("Number of times")
graph
#Aggregating top 3 placements
# Stack the Winner, Runners-Up and Third columns into one Series so that a single
# histogram counts every top-3 finish.
df_first = df['Winner']
df_second = df['Runners-Up']
df_third = df['Third']
frames = [df_first,df_second,df_third]
result = pd.concat(frames)
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
plt.style.use('seaborn-v0_8-pastel' if 'seaborn-v0_8-pastel' in plt.style.available else 'seaborn-pastel')
graph = result.hist(bins=40,figsize=(35,10))
graph.set_title("Number of World Cup Top 3 Placements");
graph.set_ylabel("Number of times")
#plt.savefig('img/top3.png', bbox_inches='tight')
#Graphing Number of Goals Scored
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
plt.style.use('seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in plt.style.available else 'seaborn-poster')
# Line plot of GoalsScored against Year
graph = df.plot.line(x='Year',y='GoalsScored',figsize=(16,8),lw=3)
graph.set_title("Number of Goals Scored");
graph.set_xlabel("Year")
#Graphing Changes in World Cup tournament
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
plt.style.use('seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in plt.style.available else 'seaborn-deep')
plt.figure(figsize=(16,8))
# Two lines on the same axes: qualified teams and matches played, both against Year
plt.plot( 'Year', 'QualifiedTeams', data=df, linewidth=2,label="Number of Teams")
plt.plot( 'Year', 'MatchesPlayed', data=df, linewidth=2,label="Number of Matches Played")
plt.title("Changes in World Cup");
plt.legend()
#df.plot(x='Year',y='QualifiedTeams')
#df.plot(x='Year',y='MatchesPlayed')
#plt.show()
#Insight: On the surface it seems that the number of goals increased as time went on, most likely because we are becoming
#more powerful super humans. Unfortunately that isn't the case: it is explained by the changes to the tournament, namely
#the increase in the number of teams and matches played
# MATCHES DATAFRAME
# Summary of the match-level table
df1.info() #Matches
# Question to answer: were more goals scored after half-time?
df1.head()
# Total goals over all matches, home side then away side
ht_goals, at_goals = (
    df1[column].sum() for column in ('Home Team Goals', 'Away Team Goals')
)
# Goals already on the board at half-time, home side then away side
htht_goals, htat_goals = (
    df1[column].sum() for column in ('Half-time Home Goals', 'Half-time Away Goals')
)
# Goals after half-time = overall total minus the half-time total
ahtht_goals = ht_goals - htht_goals
ahtht_goals #after half time home team goals
ahtat_goals = at_goals - htat_goals
ahtat_goals #after half time away team goals
# Put the four totals into a 2x2 table: rows Home/Away, columns before/after half-time
goalframe = pd.DataFrame(
    data={
        'Before Half-Time': [htht_goals, htat_goals],
        'After Half-Time': [ahtht_goals, ahtat_goals],
    },
    index=['Home', 'Away'],
    columns=['Before Half-Time', 'After Half-Time'],
)
goalframe
# Stacked bar chart of the table, drawn through cufflinks
goalframe.iplot(
    kind='bar',
    barmode='stack',
    filename='cufflinks/grouped-bar-chart',
    title="Total World Cup Goals",
    yTitle="Goals Scored",
)
# PLAYERS DATAFRAME
# Summary of the player-level table loaded from 'WorldCupPlayers.csv'
df2.info() #Players
# Preview of its first rows
df2.head()